knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
nypd = read.csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD")
#Remove the columns that I am not interested in for this project
nypd <- nypd %>%
select(-c(Latitude, Longitude, Lon_Lat, X_COORD_CD, Y_COORD_CD))
#Create some new columns that I will use for analysis
nypd <- nypd %>%
mutate(OCCUR_DATE = mdy(OCCUR_DATE),
OCCUR_TIME = hms(OCCUR_TIME),
OCCUR_HOUR = as.integer(OCCUR_TIME@hour),
OCCUR_MONTH = as.integer(month(OCCUR_DATE)),
OCCUR_YEAR = year(OCCUR_DATE),
STATISTICAL_MURDER_FLAG = as.logical(STATISTICAL_MURDER_FLAG),
SHOOTINGS = 1,
MURDER = as.integer(STATISTICAL_MURDER_FLAG))
#Handle missing data
nypd[nypd == ''] <- NA
nypd <- nypd %>%
replace_na(list(PERP_AGE_GROUP = "UNKNOW", PERP_RACE = "UNKNOWN", PERP_SEX = "UNKNOWN", VIC_AGE_GROUP = "UNKNOWN", VIC_SEX="UNKNOWN", VIC_RACE="UNKNOWN"))
#color plot
ggplot(nypd, aes(x=as.factor(BORO), fill=as.factor(BORO))) +
geom_bar() +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")
#Barplot by Boro, stacked by murder
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
geom_bar(position = "stack")
#Barplot by Boro, stacked by murder percentage
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
geom_bar(position = "fill")
#Create a new df for time series analysis
nypd_annual <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
select(OCCUR_YEAR, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
nypd_annual_murder <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
MURDER = sum(MURDER)) %>%
select(OCCUR_YEAR, SHOOTINGS, MURDER) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
#Create a new df for boro time series analysis
nypd_annual_boro <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS, BORO)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
select(OCCUR_YEAR, BORO, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR', 'SHOOTINGS'. You can override
## using the `.groups` argument.
#Timeseries lineplot
ggplot(nypd_annual, aes(x=OCCUR_YEAR)) +
geom_line(aes(y=SHOOTINGS)) +
geom_point(aes(y=SHOOTINGS)) +
geom_line(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
geom_point(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
labs(title = "NYPD Shootings (Black) and Murders (Red) by Year",
x = "Year",
y = "Shootings/Murders")
#Timeseries by boro
ggplot(nypd_annual_boro, aes(x=OCCUR_YEAR, y=SHOOTINGS, color=BORO)) +
geom_line() +
geom_point() +
labs(title = "NYPD Shootings by Year by Boro",
x = "Year",
y = "Shootings")
#histogram by hour
ggplot(nypd, aes(x=as.factor(OCCUR_HOUR))) +
geom_bar(fill="blue")
#histogram by month
ggplot(nypd, aes(x=as.factor(OCCUR_MONTH))) +
geom_bar(fill="red")
#2d histogram by month and time of day
fig1 <- plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour")
fig1 <- fig1 %>%
colorbar(title = "Shootings") %>%
layout(title = 'Time of Day vs. Month', plot_bgcolor = "#e5ecf6", xaxis=list(title = "Time (Local)"), yaxis = list(title = "Month"))
fig1
#putting it all together
s <- subplot(
plot_ly(x = nypd$OCCUR_HOUR, type = "histogram"),
plotly_empty(),
plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour"),
plot_ly(y = nypd$OCCUR_MONTH, type = "histogram"),
nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), margin = 0,
shareX = TRUE, shareY = TRUE, titleX = FALSE, titleY = FALSE
)
## Warning: No trace type specified and no positional attributes specified
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- layout(s, showlegend = FALSE)
fig
I decided to create a basic logistic regression model to try to predict whether a shooting was a murder or not. Logistic regressions are a generalized linear regression model for classification. In this case, it is binary since it is either a murder (1) or not a murder (0).
nypd_factor <- nypd %>%
filter(PERP_SEX == 'M' | PERP_SEX == 'F') %>%
filter(PERP_RACE != "UNKNOWN")%>%
filter(PERP_AGE_GROUP != "UNKNOWN")%>%
filter(VIC_SEX == 'M' | VIC_SEX == 'F') %>%
filter(VIC_RACE != "UNKNOWN")%>%
filter(VIC_AGE_GROUP != "UNKNOWN")%>%
mutate(PERP_AGE_GROUP = as.factor(PERP_AGE_GROUP))%>%
mutate(PERP_SEX = as.factor(PERP_SEX))%>%
mutate(PERP_RACE = as.factor(PERP_RACE))%>%
mutate(VIC_AGE_GROUP = as.factor(VIC_AGE_GROUP))%>%
mutate(VIC_SEX = as.factor(VIC_SEX))%>%
mutate(VIC_RACE = as.factor(VIC_RACE))
log_reg <- glm(MURDER ~ OCCUR_HOUR + OCCUR_MONTH + PERP_SEX +PERP_AGE_GROUP + PERP_RACE + VIC_AGE_GROUP + VIC_SEX + VIC_RACE, data = nypd_factor, family = "binomial")
summary(log_reg)
##
## Call:
## glm(formula = MURDER ~ OCCUR_HOUR + OCCUR_MONTH + PERP_SEX +
## PERP_AGE_GROUP + PERP_RACE + VIC_AGE_GROUP + VIC_SEX + VIC_RACE,
## family = "binomial", data = nypd_factor)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.396289 295.041851 -0.083 0.93410
## OCCUR_HOUR -0.001751 0.002470 -0.709 0.47838
## OCCUR_MONTH 0.007696 0.006330 1.216 0.22405
## PERP_SEXM -0.140780 0.114224 -1.232 0.21777
## PERP_AGE_GROUP1020 -11.151815 324.743707 -0.034 0.97261
## PERP_AGE_GROUP18-24 0.133598 0.073666 1.814 0.06975 .
## PERP_AGE_GROUP224 -11.267741 324.743710 -0.035 0.97232
## PERP_AGE_GROUP25-44 0.402200 0.075045 5.359 8.35e-08 ***
## PERP_AGE_GROUP45-64 0.716878 0.112075 6.396 1.59e-10 ***
## PERP_AGE_GROUP65+ 0.855667 0.282559 3.028 0.00246 **
## PERP_AGE_GROUP940 -11.321904 324.743710 -0.035 0.97219
## PERP_RACEASIAN / PACIFIC ISLANDER 11.895818 229.569261 0.052 0.95867
## PERP_RACEBLACK 11.509026 229.569185 0.050 0.96002
## PERP_RACEBLACK HISPANIC 11.410831 229.569196 0.050 0.96036
## PERP_RACEWHITE 12.085766 229.569229 0.053 0.95801
## PERP_RACEWHITE HISPANIC 11.609942 229.569191 0.051 0.95967
## VIC_AGE_GROUP1022 -10.979533 324.743705 -0.034 0.97303
## VIC_AGE_GROUP18-24 0.222279 0.076738 2.897 0.00377 **
## VIC_AGE_GROUP25-44 0.304456 0.076236 3.994 6.51e-05 ***
## VIC_AGE_GROUP45-64 0.308698 0.100047 3.086 0.00203 **
## VIC_AGE_GROUP65+ 0.495994 0.218130 2.274 0.02298 *
## VIC_SEXM -0.067592 0.062717 -1.078 0.28115
## VIC_RACEASIAN / PACIFIC ISLANDER 11.493414 185.331308 0.062 0.95055
## VIC_RACEBLACK 11.349646 185.331248 0.061 0.95117
## VIC_RACEBLACK HISPANIC 11.197579 185.331260 0.060 0.95182
## VIC_RACEWHITE 11.349346 185.331286 0.061 0.95117
## VIC_RACEWHITE HISPANIC 11.458032 185.331255 0.062 0.95070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15321 on 13939 degrees of freedom
## Residual deviance: 15111 on 13913 degrees of freedom
## AIC: 15165
##
## Number of Fisher Scoring iterations: 11
In this report, I did some analysis on NYPD shooting trends. There appeared to be a steady decline in both shootings and murders until 2020, when there was a sharp increase. Additionally, we examined the time of year and time of day when shootings are most likely to occur. The data show that the summertime and during the evening/nighttime hours are the most dangerous. Finally, I created a logistic regression model to predict whether a shooting was a murder or not. The only statistically significant predictors from the candidate predictors were related to the age of the perp and victim.
There are a few areas of potential bias in these data. There are likely shootings that go unreported to the NYPD, and hence would not be in this data. There may be some boroughs that report shootings at a higher rate than others.